pacman::p_load(tidyverse,data.table,broom,latex2exp,ggpubr,kableExtra,R.utils,estimatr,modelsummary)
rm(list=ls())
options("modelsummary_format_numeric_latex" = "plain")
################################################################################

df_b_br = fread(paste0("../../data/agribusiness/clean/beef_brazil.csv.gz"))
df_s_br = fread(paste0("../../data/agribusiness/clean/soybean_brazil.csv.gz"))
df_m_br = fread(paste0("../../data/agribusiness/clean/maize_brazil.csv.gz"))
df_s_ar = fread(paste0("../../data/agribusiness/clean/soybean_argentina.csv.gz"))

########### Counts

df_b_br[, lapply(.SD, uniqueN), by=.(year), .SDcols=c('county_id','state','hub','port','destination','exporter','exporter_group','importer','importer_group','product_type')]
df_s_br[, lapply(.SD, uniqueN), by=.(year), .SDcols=c('county_id','state','hub','port','destination','exporter','exporter_group','importer','importer_group','product_type')]
df_m_br[, lapply(.SD, uniqueN), by=.(year), .SDcols=c('county_id','state','hub','port','destination','exporter','exporter_group','importer','importer_group','product_type')]
df_s_ar[, lapply(.SD, uniqueN), by=.(year), .SDcols=c('county_id','state','hub','port','destination','exporter','exporter_group','product_type')]

df_b_br$spatial_id = df_b_br$county_id
df_s_br$spatial_id = df_s_br$county_id
df_m_br$spatial_id = df_m_br$county_id
df_s_ar$spatial_id = df_s_ar$county_id

########### Concentration measures

year_list = c(2017)
id_type_list = c('county_known')
var_list = c('country','commodity','spatial_id','state','hub','port','destination','exporter','exporter_group','product_type','equivalent_tonnes','fob_usd','id_type')

df = rbind(df_b_br[year %in% year_list & id_type %in% id_type_list, ..var_list],df_s_br[year %in% year_list & id_type %in% id_type_list, ..var_list])
df = rbind(df,df_m_br[year %in% year_list & id_type %in% id_type_list, ..var_list])[destination!='BRAZIL',]
df = rbind(df,df_s_ar[year %in% year_list & id_type %in% id_type_list, ..var_list])
df_main = df[exporter_group!='DOMESTIC CONSUMPTION',]

#Nationwide shares
df_x = df_main[, lapply(.SD, sum), by=.(country, commodity, exporter_group), .SDcols=c('equivalent_tonnes','fob_usd')]
df_y = df_main[, lapply(.SD, sum), by=.(country, commodity), .SDcols=c('equivalent_tonnes','fob_usd')]
df = merge(df_x, df_y, by=c('country','commodity'))
df[, c('share_q','share_v') := list( equivalent_tonnes.x/equivalent_tonnes.y, fob_usd.x/fob_usd.y ) ]
df_n_s = df[, list(country,commodity,exporter_group,share_q,share_v)]

#CR-1
df = df_n_s %>% group_by(country, commodity) %>% top_n(1, share_q)
setDT(df)
df[, n_cr1:=share_q]
df_n_cr1 = df[, list(country,commodity,n_cr1)]

#CR-3
df = df_n_s %>% group_by(country, commodity) %>% top_n(3, share_q)
setDT(df)
df = df[, lapply(.SD, sum), by=.(country, commodity), .SDcols=c('share_q','share_v')]
df[, n_cr3:=share_q]
df_n_cr3 = df[, list(country,commodity,n_cr3)]

#County-level shares
df_x = df_main[, lapply(.SD, sum), by=.(country, commodity, spatial_id, exporter_group), .SDcols=c('equivalent_tonnes','fob_usd')]
df_y = df_main[, lapply(.SD, sum), by=.(country, spatial_id, commodity), .SDcols=c('equivalent_tonnes','fob_usd')]
df = merge(df_x, df_y, by=c('country','commodity','spatial_id'))
df[, c('share_q','share_v') := list( equivalent_tonnes.x/equivalent_tonnes.y, fob_usd.x/fob_usd.y ) ]
df_c_s = df[, list(country,commodity,spatial_id,exporter_group,share_q,share_v)]

#CR-1
df = df_c_s %>% group_by(country, commodity, spatial_id) %>% top_n(1, share_q)
setDT(df)
df = df[, lapply(.SD, median), by=.(country, commodity), .SDcols=c('share_q','share_v')]
df[, c_cr1:=share_q]
df_c_cr1 = df[, list(country,commodity,c_cr1)]

#CR-3
df = df_c_s %>% group_by(country, commodity, spatial_id) %>% top_n(3, share_q)
setDT(df)
df = df[, lapply(.SD, sum), by=.(country, commodity, spatial_id), .SDcols=c('share_q','share_v')]
df = df[, lapply(.SD, median), by=.(country, commodity), .SDcols=c('share_q','share_v')]
df[, c_cr3:=share_q]
df_c_cr3 = df[, list(country,commodity,c_cr3)]

#Share with 1 firm
df = df_c_s %>% group_by(country, commodity, spatial_id) %>% top_n(1, share_q)
setDT(df)
df_n1 = df[share_q==1, lapply(.SD, uniqueN), by=.(country, commodity), .SDcols=c('spatial_id')]
df_n = df[, lapply(.SD, uniqueN), by=.(country, commodity), .SDcols=c('spatial_id')]
df = merge(df_n1, df_n, by=c('country','commodity'))
df[, c('share_1') := list( spatial_id.x/spatial_id.y ) ]
df_c_s1 = df[, list(country,commodity,share_1)]

#Firms
df_f = df_main[, lapply(.SD, uniqueN), by=.(country, commodity), .SDcols=c('exporter_group')]


########### Upstream agricultural establishments

df = fread("../../data/size/clean/brazil_size_distribution_land.csv.gz")
df = df[year==2017 & variable=='establishments' & group_type=='activity' & group_item=='livestock',]
df = df[, lapply(.SD, sum), .SDcols=c('value')]
df[, c('country','commodity') := list('BRAZIL','beef') ]
setnames(df, c('value'), c('establishments'))
df_b = df

df = fread("../../data/size/clean/brazil_size_distribution_land.csv.gz")
df = df[year==2017 & variable=='establishments' & group_item=='maize',]
df = df[, lapply(.SD, sum), .SDcols=c('value')]
df[, c('country','commodity') := list('BRAZIL','maize') ]
setnames(df, c('value'), c('establishments'))
df_m = df

df = fread("../../data/size/clean/brazil_size_distribution_land.csv.gz")
df = df[year==2017 & variable=='establishments' & group_item=='soybean',]
df = df[, lapply(.SD, sum), .SDcols=c('value')]
df[, c('country','commodity') := list('BRAZIL','soybean') ]
setnames(df, c('value'), c('establishments'))
df_s = df

df = rbind(df_b, df_s)
df = rbind(df, df_m)

df_s_a = df_s
df_s_a[,country:='ARGENTINA']
df_s_a[,establishments:=42428] #(census value)

df = rbind(df, df_s_a)
df_u = df

########### Table - Concentration

df = merge(df_u, df_f, by=c('country','commodity'))
df = merge(df, df_n_cr1, by=c('country','commodity'))
df = merge(df, df_n_cr3, by=c('country','commodity'))
df = merge(df, df_c_cr1, by=c('country','commodity'))
df = merge(df, df_c_cr3, by=c('country','commodity'))
df = merge(df, df_c_s1, by=c('country','commodity'))
setorderv(df, cols='country', order=-1)
df[,5:9] = round(df[,5:9], digits=2)
tab = data.table(rownames = c('country','commodity','Number of agricultural establishments (sellers)','Number of agribusiness firms (buyers)',
                              'CR-1 (national market)','CR-3 (national market)','CR-1 (median across local upstream markets)','CR-3 (median across local upstream markets)',
                              'Share of upstream markets with only 1 agribusiness firm'), t(df))
tab = tab[-c(1,2),]
setnames(tab, c('rownames','V1','V2','V3','V4'), c('','Beef', 'Maize','Soybean','Soybean'))
export_tab = knitr::kable(tab, booktabs = T, align = "lcccc", format = 'latex', linesep = c("","\\addlinespace"),
                 caption='Agribusiness concentration in upstream markets.', label=' trase - concentration', 
                 format.args = list(big.mark = ",")) %>%
  kable_styling(latex_options = c("hold_position")) %>%
  add_header_above(c(" " = 1, "Brazil" = 3, "Argentina" = 1))
kableExtra::save_kable(export_tab, file = "../../output/tables/table1_concentration.tex")


########### Correlations

for(spatial_unit in c('mesoregion_id','microregion_id','amc_id') ){
  
  #Distance matrix
  df = fread(paste0("../../data/agribusiness/clean/beef_brazil_county_hub_distance.csv.gz"))[year %in% year_list,][product_type %in% c("BEEF BONELESS","BEEF OFFALS","BEEF DRIED SALTED SMOKED","CATTLE"),]
  setnames(df, old=spatial_unit, new='spatial_id')
  
  #Shares and market access
  df_x = df[!is.na(spatial_id) & !is.na(county_id_hub),list(spatial_id, county_id_hub, product_type, distance_km)][, lapply(.SD, mean), by=.(spatial_id, county_id_hub, product_type),]
  df_ij = df[!is.na(spatial_id) & !is.na(county_id_hub),list(spatial_id, county_id_hub, product_type, equivalent_tonnes)][, lapply(.SD, sum), by=.(spatial_id, county_id_hub, product_type),]
  df_i = df_ij[, list(spatial_id, product_type, equivalent_tonnes)][, lapply(.SD, sum), by=.(spatial_id,product_type),]
  df_y = merge(df_ij,df_i, by=c('spatial_id','product_type'))[, c('s_ij') := list( equivalent_tonnes.x/equivalent_tonnes.y)][,list(spatial_id, county_id_hub, product_type, s_ij)]
  df_d = merge(df_x,df_y,by=c('spatial_id','county_id_hub','product_type'))
  df_d[, c('distance','distance_w') := list(1/(distance_km+1), s_ij/(distance_km+1) )]
  df_hub_reliance = df_d
  
  df_j1 = df_ij[, list(county_id_hub, product_type, equivalent_tonnes)][, lapply(.SD, sum), by=.(county_id_hub,product_type),]
  df_j2 = df_ij[, list(product_type, equivalent_tonnes)][, lapply(.SD, sum), by=.(product_type),]
  df_y = merge(df_j1,df_j2, by=c('product_type'))[, c('s_j') := list( equivalent_tonnes.x/equivalent_tonnes.y)][,list(county_id_hub, product_type, s_j)]
  df_d = merge(df_x,df_y,by=c('county_id_hub','product_type'))
  df_d[, c('distance','distance_w') := list(1/(distance_km+1), s_j/(distance_km+1) )]
  df_hub_size = df_d
  
  #Distance
  df_d=df_hub_size
  df_d=df_d[,list(spatial_id, product_type, distance, distance_w)][, lapply(.SD, sum), by=.(spatial_id, product_type),]

  #Concentration
  df_1 = fread(paste0("../../data/agribusiness/clean/beef_brazil.csv.gz"))
  df = df_1[year %in% year_list & id_type %in% id_type_list, ]
  df = df[exporter_group!='DOMESTIC CONSUMPTION',][destination!='BRAZIL',]
  setnames(df, old=spatial_unit, new='spatial_id')
  df_a = df[!is.na(spatial_id),][, list(country,spatial_id,state,exporter_group,equivalent_tonnes,fob_usd, product_type)]
  
  #Concentration ratios
  df_x = df_a[, lapply(.SD, sum), by=.(country, spatial_id, exporter_group), .SDcols=c('equivalent_tonnes','fob_usd')]
  df_y = df_a[, lapply(.SD, sum), by=.(country, spatial_id), .SDcols=c('equivalent_tonnes','fob_usd')]
  df = merge(df_x, df_y, by=c('country','spatial_id'))
  df[, c('share_q','q','share_v','v') := list( equivalent_tonnes.x/equivalent_tonnes.y, equivalent_tonnes.y, fob_usd.x/fob_usd.y, fob_usd.y ) ]
  df_s = df[, list(country,spatial_id,exporter_group,share_q,q,share_v,v)]
  
  df = df_s %>% group_by(country, spatial_id) %>% top_n(3, share_q)
  setDT(df)
  df = df[, lapply(.SD, sum), by=.(country, spatial_id), .SDcols=c('share_q')]
  setnames(df, c('share_q'),  c('cr3'))
  df_cr3 = df
  
  df = df_s %>% group_by(country, spatial_id) %>% top_n(1, share_q)
  setDT(df)
  df = df[, lapply(.SD, sum), by=.(country, spatial_id), .SDcols=c('q')]
  df_q = df
  
  #Merge all
  df_c = merge(df_cr3, df_q, by=c('country', 'spatial_id'))

  #Agribusiness FOB price
  df = df_a[product_type %in% c("BEEF BONELESS","BEEF OFFALS","BEEF DRIED SALTED SMOKED","CATTLE"), lapply(.SD, sum), by=.(country, product_type, spatial_id), .SDcols=c('equivalent_tonnes','fob_usd')]
  df = df[fob_usd>0 & equivalent_tonnes>0, ]
  df$price_a = df$fob_usd/df$equivalent_tonnes
  df_pa = df[, list(country, product_type, spatial_id, price_a)]
  
  #Farm prices
  df = fread(paste0("../../data/prices/clean/farmgate_cattle_brazil_",spatial_unit,".csv.gz"))
  df = df[year %in% year_list & establishment_type %in% c("all establishments") & value_type %in% c("breeders_p","calves_p"),][, price_f := value]
  df[, c('country') := list('BRAZIL')]
  df = df[, lapply(.SD, median), by=.(country, spatial_id), .SDcols=c('price_f')]
  df_pf = df[, list(country, spatial_id, price_f)]
  
  #Final dataset
  df = merge(df_c, df_pa, by=c('country', 'spatial_id'))
  df = merge(df, df_pf, by=c('country', 'spatial_id'))
  df = merge(df, df_d, by=c('product_type', 'spatial_id'))
  df$fshare = df$price_f/df$price_a
  df_final = df
  
  if(spatial_unit=='mesoregion_id'){df_final_meso=df_final }
  if(spatial_unit=='microregion_id'){df_final_micro=df_final }
  if(spatial_unit=='amc_id'){df_final_amc_id=df_final }
  
}


##### Regressions

for(spatial_unit in c('mesoregion_id','microregion_id','amc_id') ){
  
  if(spatial_unit=='mesoregion_id'){df = df_final_meso}
  if(spatial_unit=='microregion_id'){df = df_final_micro}
  if(spatial_unit=='amc_id'){df = df_final_amc_id}
  
  x_lab = 'CR-3'
  y_lab = 'farm-gate/agribusiness price ratio'
  
  df$y_var = df$fshare
  df$x_var = df$cr3
  df$w_var = df$q
  df$ma_var = df$distance_w
  lq = 0.025
  uq = 0.975
  df_ou = df[, .(lq_x=quantile(x_var,lq), uq_x=quantile(x_var,uq), iqr_x = IQR(x_var),
                       lq_y=quantile(y_var,lq), uq_y=quantile(y_var,uq), iqr_y = IQR(y_var)), by = .(country)]
  df = merge(df, df_ou, by=c('country') )
  df = df[x_var<1,]
  
  ols = lm_robust(y_var ~ x_var + factor(product_type), data=df, weights=w_var)
  ols_d = lm_robust(y_var ~ x_var + ma_var + factor(product_type), data=df, weights=w_var)

  if(spatial_unit=='mesoregion_id'){ols_meso=ols; ols_d_meso=ols_d }
  if(spatial_unit=='microregion_id'){ols_micro=ols; ols_d_micro=ols_d }
  if(spatial_unit=='amc_id'){ols_amc=ols; ols_d_amc=ols_d }
  
}


##### Table - Correlation

tab_title = paste0('Upstream market concentration and accounting-based markdowns.')
rows <- tribble(~term,  ~OLS, ~OLS,~OLS,~OLS,~OLS,~OLS,
                'Market definition',  'Mesoregion','Mesoregion','Microregion','Microregion','AMC','AMC',
                'Market access control','N','Y','N','Y','N','Y')
attr(rows, 'position') <- c(3)
gm <- tibble::tribble(~raw,        ~clean,          ~fmt,
                      "nobs",      "Observations",  0)
notes_content = paste0('Notes: ')
results_table_tex <- msummary(list('OLS'=ols_meso,'OLS'=ols_d_meso,'OLS'=ols_micro,'OLS'=ols_d_micro,'OLS'=ols_amc,'OLS'=ols_d_amc),
                              coef_map =  c('x_var'= 'CR-3'), coef_omit = 'Intercept',
                              title = paste0('\\label{tab: trase - concentration markdown correlation}',tab_title),
                              estimate = "{estimate}{stars}", gof_map = gm, escape=F,
                              add_rows = rows,
                              output='latex') %>% row_spec(1, bold = T)%>% add_header_above(c(" "=1, "farm-gate price/agribusiness price ratio" = 6))
kableExtra::save_kable(results_table_tex, file = paste0("../../output/tables/table2_correlation.tex"))



